library(tidyverse)
## ── Attaching packages ────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 3.0.0 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
library(wordbankr)
library(here)
## here() starts at /cloud/project
library(RColorBrewer)
library(wesanderson)
library(ggthemes)
library(beyonce)
## Registered S3 method overwritten by 'beyonce':
## method from
## print.palette wesanderson
library(viridis)
## Loading required package: viridisLite
library(forcats)
library(colorblindr)
## Loading required package: colorspace
library(ggrepel)
sounds <- read_csv(here::here("data", "animal_sounds_summary.csv"))
## Parsed with column specification:
## cols(
## age = col_double(),
## sound = col_character(),
## kids_produce = col_double(),
## kids_understand = col_double(),
## kids_respond = col_double(),
## prop_produce = col_double(),
## prop_understand = col_double()
## )
glimpse(sounds)
## Rows: 33
## Columns: 7
## $ age <dbl> 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12,…
## $ sound <chr> "cockadoodledoo", "meow", "woof woof", "cockadoodledo…
## $ kids_produce <dbl> 1, 0, 3, 0, 2, 2, 0, 5, 4, 0, 5, 12, 0, 12, 28, 9, 12…
## $ kids_understand <dbl> 3, 10, 12, 2, 21, 22, 9, 41, 40, 4, 36, 32, 16, 59, 5…
## $ kids_respond <dbl> 35, 35, 35, 91, 93, 93, 139, 145, 143, 94, 94, 94, 14…
## $ prop_produce <dbl> 0.02857143, 0.00000000, 0.08571429, 0.00000000, 0.021…
## $ prop_understand <dbl> 0.08571429, 0.28571429, 0.34285714, 0.02197802, 0.225…
sounds %>%
distinct(sound) %>%
knitr::kable()
| sound |
|---|
| cockadoodledoo |
| meow |
| woof woof |
# List the distinct ages present in the data as a plain vector.
# The column is named and pull() is called explicitly, so this step does not
# depend on magrittr accepting a bare function name or on pull()'s
# "last column" default.
sounds %>%
  distinct(age) %>%
  pull(age)
## [1] 8 9 10 11 12 13 14 15 16 17 18
names(sounds)
## [1] "age" "sound" "kids_produce" "kids_understand"
## [5] "kids_respond" "prop_produce" "prop_understand"
sounds %>%
group_by(sound) %>%
summarize(total_produce = sum(kids_produce)) %>%
knitr::kable()
| sound | total_produce |
|---|---|
| cockadoodledoo | 148 |
| meow | 681 |
| woof woof | 940 |
## Initial EDA Plots
# Bar chart of kids_produce by sound. geom_col() stacks the rows that share a
# sound, so each bar's height is the sum of kids_produce over all ages.
ggplot(sounds, aes(x = sound, y = kids_produce)) +
geom_col() +
labs(x = "Sound", y = "Total Children Producing")
# Proportion of children producing each sound at each age, one panel per sound.
ggplot(sounds, aes(x = age, y = prop_produce)) +
geom_col() +
labs(x = "Age (mos)", y = "Proportion of Children Producing") +
facet_wrap(~sound)
## Scatter Plot
ggplot(sounds, aes(x = age, y = prop_produce)) +
geom_point() +
labs(x = "Age (mos)", y = "Proportion of Children Producing") +
facet_wrap(~sound)
## Discrete Plot
# All sounds drawn together in a single panel: no facets and no colour
# mapping, so the three sounds cannot be told apart yet.
ggplot(data = sounds, mapping = aes(x = age, y = prop_produce)) +
  labs(x = "Age (months)", y = "Proportion of Children Producing") +
  geom_point(size = 2)
# Number of rows in the summary table.
count(sounds)
## # A tibble: 1 x 1
## n
## <int>
## 1 33
Remember: Make sure to adjust the labels!!
# Step 1: colour is mapped inside geom_point() only, so the points are
# coloured by sound.
ggplot(sounds, aes(x = age, y = prop_produce)) +
geom_point(aes(color = sound), size = 2) +
labs(x = "Age (months)", y = "Proportion of Children Producing")
# Step 2: geom_line() has no group or colour mapping of its own here, so it
# joins every row into one path ordered by age instead of one line per sound.
ggplot(sounds, aes(x = age, y = prop_produce)) +
geom_line() +
geom_point(aes(color = sound), size = 2) +
labs(x = "Age (months)", y = "Proportion of Children Producing")
# A possible solution: group the line layer by sound so each sound gets its
# own line (the lines keep the default colour; only the points are coloured).
ggplot(sounds, aes(x = age, y = prop_produce)) +
geom_line(aes(group = sound)) +
geom_point(aes(color = sound), size = 2) +
labs(x = "Age (months)", y = "Proportion of Children Producing")
# Does this look right? yes!
# Mapping colour in ggplot() applies it to every layer, which colours the
# lines and also splits them by sound.
ggplot(sounds, aes(x = age, y = prop_produce, color = sound)) +
geom_line() +
geom_point(size = 2) +
labs(x = "Age (months)", y = "Proportion of Children Producing")
## geom_smooth()
# Replace the connecting line with a smoothed trend per sound.
# se = FALSE hides the confidence ribbon; lwd = .5 draws a thinner curve.
ggplot(sounds, aes(x = age,
                   y = prop_produce,
                   color = sound)) +
  geom_smooth(se = FALSE, lwd = .5) +
  geom_point(size = 2) +
  labs(x = "Age (months)", y = "Proportion of Children Producing")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Controlling factor order
library(forcats)
# Store sound as a factor so its level order (and therefore the legend order)
# can be controlled with forcats.
sounds <- sounds %>%
mutate(sound = as.factor(sound))
# Base plot reused by the colour-scale examples that add a scale to
# sound_traj. fct_reorder2(sound, age, prop_produce) orders the levels by
# prop_produce at the largest age, so the legend entries follow the
# right-hand ends of the curves.
sound_traj <- ggplot(sounds, aes(x = age,
y = prop_produce,
color = fct_reorder2(sound, age, prop_produce))) +
geom_smooth(se = FALSE, lwd = .5) +
geom_point(size = 2) +
# color = "sound" replaces the default legend title, which would otherwise
# be the full fct_reorder2(...) expression.
labs(x = "Age (months)",
y = "Proportion of Children Producing",
color = "sound")
# Print the stored plot.
sound_traj
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Experiment with each property in scale_color_hue() to get a sense of what it does.
sound_traj +
scale_color_hue()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Change hue (l and c are defaults)
sound_traj +
scale_color_hue(h = c(0, 90), l = 65, c = 100)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Use luminance=45, instead of default 65
sound_traj +
scale_color_hue(l = 45)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Reduce saturation (chroma) from 100 to 50, and increase luminance
sound_traj +
scale_color_hue(l = 75, c = 50)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
sound_traj +
scale_color_manual(values = c("cornflowerblue",
"seagreen", "coral"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Experiment with scale_color_manual() and some of the various named colors that come built-in to R!
Why doesn’t the code block change the colors?
ggplot(sounds, aes(x = age,
y = prop_produce,
color = fct_reorder2(sound, age, prop_produce))) +
geom_smooth(se = FALSE, lwd = .5) +
geom_point(size = 2) +
labs(x = "Age (months)",
y = "Proportion of Children Producing",
color = "sound") +
scale_fill_manual(values = c("cornflowerblue",
"seagreen", "coral"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(sounds, aes(x = age,
y = prop_produce,
fill = fct_reorder2(sound, age, prop_produce))) +
geom_smooth(se = FALSE, lwd = .5) +
geom_point(size = 2) +
labs(x = "Age (months)",
y = "Proportion of Children Producing",
fill = "sound") +
scale_fill_manual(values = c("cornflowerblue",
"seagreen", "coral"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
sound_traj
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(sounds, aes(x = age,
y = prop_produce,
fill = fct_reorder2(sound, age, prop_produce))) +
geom_smooth(aes(color = fct_reorder2(sound, age, prop_produce)),
se = FALSE, lwd = .5, show.legend = FALSE) +
geom_point(size = 2, shape = 21) +
labs(x = "Age (months)",
y = "Proportion of Children Producing",
fill = "sound")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(sounds, aes(x = age,
y = prop_produce,
fill = fct_reorder2(sound, age, prop_produce))) +
geom_smooth(aes(color = fct_reorder2(sound, age, prop_produce)),
se = FALSE, lwd = .5, show.legend = FALSE) +
geom_point(size = 2, shape = 21) +
labs(x = "Age (months)",
y = "Proportion of Children Producing",
fill = "sound") +
scale_fill_manual(values = c("cornflowerblue",
"seagreen", "coral")) +
scale_color_manual(values = c("cornflowerblue",
"seagreen", "coral"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
my_colors <- c("cadetblue", "steelblue", "salmon") # quote color names
sound_traj +
scale_color_manual(values = my_colors) # note: not in quotes
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# from https://github.com/mwaskom/seaborn/blob/master/seaborn/palettes.py
sb_colorblind <- c("#0072B2", "#009E73", "#D55E00",
"#CC79A7", "#F0E442", "#56B4E9")
sound_traj +
scale_colour_manual(values = sb_colorblind)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## scale_color_brewer()
# RColorBrewer supplies brewer.pal() and display.brewer.pal().
library(RColorBrewer)
brewer.pal(5, "Dark2") # list 5 hex colors
## [1] "#1B9E77" "#D95F02" "#7570B3" "#E7298A" "#66A61E"
display.brewer.pal(5, "Dark2") # view 5 hex colors
sound_traj +
scale_color_brewer(palette = "Dark2")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## wesanderson
# wesanderson supplies wes_palettes and wes_palette().
library(wesanderson)
names(wes_palettes) # all the palette names
## [1] "BottleRocket1" "BottleRocket2" "Rushmore1" "Rushmore"
## [5] "Royal1" "Royal2" "Zissou1" "Darjeeling1"
## [9] "Darjeeling2" "Chevalier1" "FantasticFox1" "Moonrise1"
## [13] "Moonrise2" "Moonrise3" "Cavalcanti1" "GrandBudapest1"
## [17] "GrandBudapest2" "IsleofDogs1" "IsleofDogs2"
wes_palette("GrandBudapest2") # view named palette
wes_palette("GrandBudapest2")[1:4] # list first 4 hex colors
## [1] "#E6A0C4" "#C6CDF7" "#D8A499" "#7294D4"
wes_palette("GrandBudapest2")[c(1,4)] # list colors 1 and 4
## [1] "#E6A0C4" "#7294D4"
sound_traj +
scale_color_manual(values = wes_palette("Darjeeling1"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
sound_traj +
scale_color_manual(values = wes_palette("FantasticFox1"))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
sound_traj +
scale_color_manual(values = wes_palette("Darjeeling1")[3:5])
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
sound_traj +
scale_color_manual(values = wes_palette("FantasticFox1")[c(2, 3, 5)])
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## ggthemes
# ggthemes supplies scale_color_fivethirtyeight() and scale_color_economist().
library(ggthemes)
sound_traj +
scale_color_fivethirtyeight()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
sound_traj +
scale_color_economist()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## beyonce
# beyonce supplies beyonce_palette().
library(beyonce)
beyonce_palette(18)
sound_traj +
scale_color_manual(values = beyonce_palette(18)[3:5])
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
sound_traj +
scale_color_manual(values = beyonce_palette(18)[c(1, 4, 5)])
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## viridis
# discrete = TRUE is needed because the colour aesthetic is a factor; the
# scale_color_viridis() default is a continuous scale.
sound_traj +
  scale_color_viridis(discrete = TRUE) +
  theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
sound_traj +
scale_color_viridis(discrete = TRUE, option = "plasma") +
theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(sounds, aes(x = age,
y = prop_produce,
fill = fct_reorder2(sound, age, prop_produce))) +
geom_smooth(aes(color = fct_reorder2(sound, age, prop_produce)),
se = FALSE, lwd = .5, show.legend = FALSE) +
geom_point(size = 2, shape = 21, colour = "midnightblue") +
labs(x = "Age (months)",
y = "Proportion of Children Producing",
fill = "sound") +
scale_fill_viridis(discrete = TRUE) +
scale_color_viridis(discrete = TRUE) +
theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Experimenting with scale_color_grey()/scale_fill_grey()
sound_traj +
scale_color_grey() +
theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
sound_traj +
scale_color_grey(start = 0.2, end = .8)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(sounds, aes(x = age,
y = prop_produce,
fill = fct_reorder2(sound, age, prop_produce))) +
geom_smooth(aes(color = fct_reorder2(sound, age, prop_produce)),
se = FALSE, lwd = .5, show.legend = FALSE) +
geom_point(size = 2, shape = 21) +
labs(x = "Age (months)",
y = "Proportion of Children Producing",
fill = "sound") +
scale_fill_grey(start = 0.3, end = 1) +
scale_color_grey(start = 0.3, end = 1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(sounds, aes(x = age,
y = prop_produce,
fill = fct_reorder2(sound, age, prop_produce))) +
geom_smooth(aes(lty = fct_reorder2(sound, age, prop_produce)), color = "black",
se = FALSE, lwd = .5, show.legend = FALSE) +
geom_point(size = 2, shape = 21) +
labs(x = "Age (months)",
y = "Proportion of Children Producing",
fill = "sound") +
scale_fill_grey(start = 0.3, end = 1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplot(sounds, aes(x = age,
y = prop_produce,
fill = fct_reorder2(sound, age, prop_produce))) +
geom_smooth(aes(color = fct_reorder2(sound, age, prop_produce),
lty = fct_reorder2(sound, age, prop_produce)),
se = FALSE, lwd = .5, show.legend = FALSE) +
geom_point(size = 2, shape = 21) +
labs(x = "Age (months)",
y = "Proportion of Children Producing",
fill = "sound") +
scale_fill_grey(start = 0.3, end = .8) +
scale_color_grey(start = 0.3, end = .8)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## colorblindr
# Keep a manually coloured version of the plot so it can be passed to
# cvd_grid() for a colour-vision-deficiency check.
my_sound_traj <- sound_traj +
  scale_color_manual(values = beyonce_palette(18)[c(1, 4, 5)])
library(colorblindr)
cvd_grid(my_sound_traj)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cb_sound_traj <- sound_traj +
scale_color_OkabeIto()
cb_sound_traj
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cvd_grid(cb_sound_traj)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
# To use for line and point colors, add
sound_traj +
scale_colour_manual(values = cbbPalette[c(3, 7, 8)])
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
library(ggrepel)
# Label each curve directly at its right-hand end instead of using a legend.
# `label` holds the sound name only on rows at the maximum age. case_when()
# leaves every other row NA, and na.rm = TRUE in geom_text_repel() drops
# those rows without a warning.
sounds <- sounds %>%
  mutate(label = case_when(
    age == max(age) ~ sound))
ggplot(sounds, aes(x = age,
                   y = prop_produce,
                   color = fct_reorder2(sound, age, prop_produce))) +
  geom_smooth(se = FALSE, lwd = .5) +
  geom_point(size = 2) +
  labs(x = "Age (months)",
       y = "Proportion of Children Producing") +
  # nudge_x pushes labels to the right of the last point; direction = "y"
  # lets ggrepel move them only vertically to avoid overlaps.
  geom_text_repel(aes(label = label),
                  nudge_x = 1,
                  direction = "y",
                  na.rm = TRUE) +
  # "none" is the supported way to drop a legend; guides(color = FALSE) is
  # deprecated in ggplot2 >= 3.3.4.
  guides(color = "none")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Experiment a bit with scale_color_gradient()/scale_fill_gradient()!
sound_by_age <- ggplot(sounds, aes(x = age,
y = prop_produce,
color = age)) +
geom_line(aes(group = sound), lwd = .5) +
geom_point(size = 2) +
labs(x = "Age (months)",
y = "Proportion of Children Producing")
sound_by_age
sound_by_age +
scale_color_gradient()
sound_by_age +
scale_color_gradient(trans = "reverse")
sound_by_age +
scale_color_gradient(low = "white", high = "red")
sound_by_age +
scale_color_gradient(low = "grey90", high = "black")
# Diverging colour scheme, centred on the median age in the data
med_age <- median(sounds$age)
sound_by_age +
  scale_color_gradient2(low = "blue", mid = "white", high = "red",
                        midpoint = med_age)
sound_by_age +
scale_color_gradientn(colours = brewer.pal(n=5, name="PuBuGn"))
sound_by_age +
scale_color_gradientn(colours = rev(brewer.pal(n=5, name="PuBuGn")))
Experiment a bit with RColorBrewer and viridis
sound_by_age +
scale_color_viridis()
sound_by_age +
scale_color_viridis(option = "magma")
sound_by_age +
scale_color_viridis(option = "inferno", begin = 1, end = 0)
# adult-training.csv has no header row. Supplying the column names to
# read_csv() keeps the first observation as data; letting readr infer a header
# would promote that row (39, State-gov, 77516, ...) to column names and drop
# it from the table.
# NOTE: any printed output produced before this fix (e.g. a 32,560-row tibble)
# is one row short; re-knit to refresh it.
salary_cols <- c(
  "Age", "workclass", "fnlwgt", "education", "education_num",
  "marital_status", "occupation", "relationship", "race", "gender",
  "capital_gain", "capital_loss", "hours_per_week", "native_country",
  "income_bracket"
)
salary <- read_csv("adult-training.csv", col_names = salary_cols)
salary
## # A tibble: 32,560 x 15
## Age workclass fnlwgt education education_num marital_status occupation
## <dbl> <chr> <dbl> <chr> <dbl> <chr> <chr>
## 1 50 Self-emp… 83311 Bachelors 13 Married-civ-s… Exec-mana…
## 2 38 Private 215646 HS-grad 9 Divorced Handlers-…
## 3 53 Private 234721 11th 7 Married-civ-s… Handlers-…
## 4 28 Private 338409 Bachelors 13 Married-civ-s… Prof-spec…
## 5 37 Private 284582 Masters 14 Married-civ-s… Exec-mana…
## 6 49 Private 160187 9th 5 Married-spous… Other-ser…
## 7 52 Self-emp… 209642 HS-grad 9 Married-civ-s… Exec-mana…
## 8 31 Private 45781 Masters 14 Never-married Prof-spec…
## 9 42 Private 159449 Bachelors 13 Married-civ-s… Exec-mana…
## 10 37 Private 280464 Some-col… 10 Married-civ-s… Exec-mana…
## # … with 32,550 more rows, and 8 more variables: relationship <chr>,
## # race <chr>, gender <chr>, capital_gain <dbl>, capital_loss <dbl>,
## # hours_per_week <dbl>, native_country <chr>, income_bracket <chr>
salary %>%
group_by(race) %>%
summarize(total_capital_gain = sum(capital_gain)) %>%
knitr::kable()
| race | total_capital_gain |
|---|---|
| Amer-Indian-Eskimo | 194458 |
| Asian-Pac-Islander | 1536014 |
| Black | 1905454 |
| Other | 253293 |
| White | 31197931 |
# Per-row table used for plotting: keep the four columns of interest, drop
# rows whose capital_gain is 25000 or more, then attach the mean capital gain
# of each education_num x gender group to every row of that group.
# The result stays grouped by education_num and gender.
salary_untidy <- salary %>%
  select(Age, education_num, capital_gain, gender) %>%
  filter(capital_gain < 25000) %>%
  group_by(education_num, gender) %>%
  mutate(Average = mean(capital_gain))
salary_untidy
## # A tibble: 32,345 x 5
## # Groups: education_num, gender [32]
## Age education_num capital_gain gender Average
## <dbl> <dbl> <dbl> <chr> <dbl>
## 1 50 13 0 Male 1116.
## 2 38 9 0 Male 395.
## 3 53 7 0 Male 266.
## 4 28 13 0 Female 496.
## 5 37 14 0 Female 1130.
## 6 49 5 0 Female 87.1
## 7 52 9 0 Male 395.
## 8 31 14 14084 Female 1130.
## 9 42 13 5178 Male 1116.
## 10 37 10 0 Male 512.
## # … with 32,335 more rows
# Base plot: dodged bars of the group mean capital gain by education index,
# one bar per gender within each education level.
salary_untidy_plot <- ggplot(salary_untidy,
                             aes(x = education_num, y = Average)) +
  geom_col(aes(fill = gender), position = "dodge", width = .6, na.rm = FALSE)
# Polished version: manual fill colours with a "Gender" legend title, axis
# labels and breaks, an underlined title, and a cleaned-up theme.
salary_untidy_plot1 <- salary_untidy_plot +
  scale_fill_manual(values = c("#E69F00", "#0072B2"), name = "Gender") +
  labs(x = "Education Index", y = "Average Capital gain") +
  # expand = c(0, 0) removes the padding so the bars sit on the axis line.
  scale_x_continuous(expand = c(0, 0), breaks = seq(0, 16, 2)) +
  scale_y_continuous(expand = c(0, 0), breaks = seq(0, 2100, 300)) +
  # The title is a plotmath formula so that underline() is rendered.
  ggtitle(~""*underline("US Adult Census data")) +
  # One theme() call with the final settings. The earlier
  # axis.text size = 8 was overridden by size = 10 and has been dropped.
  theme(plot.title = element_text(hjust = 0.5),
        axis.text = element_text(size = 10),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"))
salary_untidy_plot1
# The bars are coloured through the `fill` aesthetic, so the Okabe-Ito palette
# must be applied to the fill scale. scale_color_OkabeIto() is silently
# ignored here because no colour aesthetic is mapped.
# This replaces the manual fill scale already on salary_untidy_plot1 (ggplot2
# prints a "Scale for 'fill' is already present" message), so the legend
# title is set again.
salary_untidy_plot_colorblind <- salary_untidy_plot1 +
  scale_fill_OkabeIto(name = "Gender")
salary_untidy_plot_colorblind
# Simulate colour-vision deficiencies for the manually coloured plot.
cvd_grid(salary_untidy_plot1)
This dataset is based on US adult income data; the source is https://www.kaggle.com/uciml/adult-census-income. As part of my data wrangling, I selected the required columns and aggregated capital gain by education level. I also tried other geoms, such as scatter (point) and line plots. Because the dataset has a large number of rows, I could not see any clear pattern with those geoms, so I chose geom_col(), which also looks better aesthetically for this dataset.
To improve my data representation, I used geom_col() to plot the processed data, removed background elements, changed the font size of the axis text, underlined the plot title, highlighted the axis lines, changed the legend title, put the aesthetics in a logical sequence, and so on.
I took capital gain by education level and analysed how the average capital gain varies for males and females across education levels. The education index ranges from 1 to 16, where 1 is the lowest and 16 is the highest. My plot shows the trend in capital gain for males and females at different education levels. As the education level increases, the average capital gain increases for both males and females. However, at the same education level, males have a higher capital gain than females.
Initially I chose purple and magenta to represent Male and Female respectively, but when I ran a color-vision-deficiency simulation using “cvd_grid”, the “Desaturated” plot did not distinguish the genders: the shades for Male and Female looked quite similar. I then changed the fill colors of my plot and validated again; now the distinction between male and female data is visible for all four types of colorblindness.
salary_untidy_plot1+
scale_fill_grey() +
theme(panel.background = element_blank())
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
salary_untidy_plot2<-ggplot(salary_untidy,aes(x=education_num,y=Average))+
geom_col(aes(fill=gender),position = "dodge",width = .6,na.rm = FALSE)+
labs(x = "Education Index", y = "Average Capital gain")+
scale_fill_manual(values = c('#FFFF00','#66FF00'))+
theme(panel.grid.major = element_line(colour = "red", linetype = "dotted"),
panel.grid.minor = element_line(colour = "blue", linetype = "dotted"))
salary_untidy_plot2
cvd_grid(salary_untidy_plot2)
The above plot is very confusing: the red and blue background grid lines make the visualisation hard to read. There is no title, which makes the plot less informative. The extra spacing around the bars also detracts from the visualisation. In addition, yellow and green are a poor choice of colors for distinguishing gender. These colors also failed the colorblindness simulation: the desaturated plot fails to show the male-female distinction, and the deuteranomaly and protanomaly plots are not clear.